# =============================================================================
# =============================================================================
# Code to download ACS data at the zcta level (similar to zip code)
#       This data is used as controls in regressions and for heterogeneity tests
# =============================================================================
# =============================================================================


# =============================================================================
# Import ZCTA data for control variables
# =============================================================================

api_key='YOUR_API_KEY' #Create your API key here: https://api.census.gov/data/key_signup.html  see also https://www.census.gov/data/developers/guidance/api-user-guide.html
os.chdir(cdd['p_c']+r'/Census')
import CensusCustom as censusd2


def downsize(df,types):
    for var in list(set(list(df)) & set(list(types))):
#        print(var)
        if not isnan2(types[var]):
            if types[var]=='float':
                if not np.issubdtype(df[var].dtype, np.number):
                    df[var]=df[var].astype(str).replace(regex=True,to_replace=r'-',value=r'').replace('.','').replace('nan','').replace('&','').replace(' ','').replace('None','')
                df[var]=pd.to_numeric(df[var])
            elif types[var]=='integer':
                if not np.issubdtype(df[var].dtype, np.number):
                    df[var]=df[var].astype(str).replace(regex=True,to_replace=r'-',value=r'').replace('.','').replace('nan','').replace('&','').replace(' ','').replace('None','')
                df[var]=pd.to_numeric(df[var])
            elif types[var]=='cat':
                df[var]=df[var].astype('category')
            elif types[var]=='date':
                df[var]=pd.to_datetime(df[var], format='%Y-%m-%d')
    return df


# =============================================================================
# ACS __ 2014 (5/year files - Subset of counties)
#    Available for ZCTA since 2011
# =============================================================================
#Import the list of variables I want to use (their types and naming)
acs_vars = pd.read_excel(cdd['p_c'] + r'\Census\ACS_Variables.xlsx',sheet_name='ACS5Y')
acs_vars = acs_vars[acs_vars.dne==0]
acs_rndict = dict(zip(acs_vars['Variable'].to_list(),acs_vars['Rename'].to_list()))
acs_ddict =  dict(zip(acs_vars['Rename'].to_list(),acs_vars['Type'].to_list()))


zips = []
for y in range(2014,2023):
    print(y)
    acs_fields=acs_vars['Variable'].tolist()
    temp = censusd2.download(src='acs5', year=y, geo=censusd2.censusgeo([('zip code tabulation area', '*')]), var=acs_fields, key=api_key).reset_index()
    temp['year'] = y
    zips += [temp]

zips = pd.concat(zips,ignore_index=True,sort=False).rename(columns={'zip code tabulation area':'zip'})
zips['zip'] = zips.apply(lambda x: x['index'].geo[1][1] if x['year']<=2019 else x['index'].geo[0][1], axis=1)
zips['state'] = pd.to_numeric(zips.apply(lambda x: x['index'].geo[0][1] if x['year']<=2019 else np.nan, axis=1))
zips['state'] = zips.groupby('zip')['state'].transform('max') #The Census Bureau dropped the state from the geography index in 2020 but it should be constant within zcta
zips = downsize(df=zips.rename(columns=acs_rndict),types=acs_ddict)
zips = zips.drop(columns=['index'])
zips.to_parquet(cdd['p_d_acs_zip'] + r'\acs_2014_2022_s1.parquet')


# =============================================================================
# Adjust variables to get them in useful forms
# =============================================================================
acs=pd.read_parquet(cdd['p_d_acs_zip'] + r'\acs_2014_2022_s1.parquet')

acs['zip'] = pd.to_numeric(acs['zip'])
#Create the demo fields of interest.
acs = acs.replace(-666666666, np.nan)
acs['male_p']=acs['male']/acs['population']
acs['white_p']=acs['white']/acs['population']
acs['educ_sc_p']=acs[['educ_sc11m','educ_sc12m','educ_sc11f','educ_sc12f','educ_assocm','educ_assocf']].sum(axis=1,min_count=1)/acs['educ_total']
acs['educ_bach_p']=acs[['educ_bachm','educ_bachf','educ_grad1m','educ_grad1f','educ_grad2m','educ_grad2f','educ_grad3m','educ_grad3f']].sum(axis=1,min_count=1)/acs['educ_total']
acs['pv_total_r'] = acs[['pv_u50','pv_50to100','pv_100to125','pv_125to150','pv_150to185','pv_185to200','pv_a200']].sum(axis=1,min_count=1)
# acs['pv100_p']=acs[['pv_u50','pv_50to75','pv_75to100']].sum(axis=1,min_count=1)/acs['pv_total_r']
# acs['pv200_p']=acs[['pv_u50','pv_50to100','pv_100to125','pv_125to150','pv_150to185','pv_185to200']].sum(axis=1,min_count=1)/acs['pv_total_r']
acs['pvf_total_r'] = acs[['pvf_u50','pvf_50to75','pvf_75to100','pvf_100to125','pvf_125to150','pvf_150to175','pvf_175to185','pvf_185to200','pvf_200to300','pvf_300to400','pvf_400to500','pvf_a500']].sum(axis=1,min_count=1)
acs['pvf100_p']=acs[['pvf_u50','pvf_50to75','pvf_75to100']].sum(axis=1,min_count=1)/acs['pvf_total_r']
acs['pvf200_p']=acs[['pvf_u50','pvf_50to75','pvf_75to100','pvf_100to125','pvf_125to150','pvf_150to175','pvf_175to185','pvf_185to200']].sum(axis=1,min_count=1)/acs['pvf_total_r']

temp = ['hi_mu6_i','hi_m6to18_i','hi_m18to25_i','hi_m26to34_i','hi_m35to44_i','hi_m45to54_i','hi_m55to64_i','hi_m65to74_i','hi_ma75_i', 'hi_fu6_i','hi_f6to18_i','hi_f18to25_i','hi_f26to34_i','hi_f35to44_i','hi_f45to54_i','hi_f55to64_i','hi_f65to74_i','hi_fa75_i']
acs['insured_p'] = acs[temp].sum(axis=1) / acs[temp + [re.sub(r'_i',r'_ni',v) for v in temp]].sum(axis=1)
acs['snap_p']=acs['snap']/acs['hh_total']
acs['aid_p']=acs['aid']/acs['aid_total'] #Don't know the original variable
acs['pubai_p']=acs['pubai']/acs['pubai_total']
acs['comm_public_p']=acs['comm_public']/acs['comm_total']
acs['comm_car_p']=acs['comm_car']/acs['comm_total']
acs['comm_timemean']=acs['comm_timeagg']/(acs['comm_total']-acs['comm_home'])
#Calculate the mean income (has to be inflation adjusted: see https://www.census.gov/programs-surveys/acs/guidance/comparing-acs-data/2019/5-year-comparison.html)
acs['inc_hhmean'] = acs['inc_hhagg']/acs['hh_total']
acs['inc_hhmean'] = np.where(acs['year']<=2014,acs['inc_hhmean']*1.08096469,acs['inc_hhmean'])
acs['ownocc_p']=acs['housing_ownocc']/acs['housing_total']
#Employment variables
acs['emp_civilemp_p'] = acs['emp_civilemp'] / acs['emp_total']
acs['emp_civilunemp_p'] = acs['emp_civilunemp'] / acs['emp_total']
acs['emp_military_p'] = acs['emp_military'] / acs['emp_total']
acs['emp_nonlaborforce_p'] = acs['emp_nonlaborforce'] / acs['emp_total']
empt = ['empt_16to19','empt_20to24','empt_25to44','empt_45to54','empt_55to64','empt_65to69','empt_gte70']
acs['empt_ft_p'] = acs[[v+'_ft' for v in empt]].sum(axis=1) / acs['empt_total']
acs['empt_nft_p'] = acs[[v+'_nft' for v in empt]].sum(axis=1) / acs['empt_total']
acs['empt_nw_p'] = acs[[v+'_nw' for v in empt]].sum(axis=1) / acs['empt_total']

acs = acs.replace(np.inf,np.nan)
acs.to_parquet(cdd['p_d_acs_zip'] + r'\acs_2014_2022_s2.parquet')



# =============================================================================
# Merge in the zip code area and calculate annual population densities.
# =============================================================================

def weighted_qcut(values, weights, q):
    'Return weighted quantile cuts from a given series, values.'
    data=pd.DataFrame({'val':values,'w':weights.replace(0,np.nan)})
    data['index'] = 1
    data['index'] = data['index'].cumsum()
    #Collapse the duplicate values
    data_c = data.groupby(['val'])['w'].sum().reset_index()
    #Sort by the variable of interest
    data_c['wf'] = data_c['w'] / data_c['w'].sum()
    data_c.sort_values(by=['val'],inplace=True)
    data_c['wc'] = data_c['wf'].cumsum()
    if pd._libs.lib.is_integer(q):
        data_c['q'] = np.floor(data_c['wc']*q)
        data_c['q'] = data_c['q'].clip(0,q-1)
    else:
        data_c['q'] = data_c['wc'].apply(lambda x: min([v for v in q if v>x]))
    data = pd.merge(data,data_c[['val','q']],how='left',on='val')
    data['q'] = np.where((pd.isnull(data['val']))|(pd.isnull(data['w'])),np.nan,data['q'])
    return data.sort_values(by='index')['q'].values

#Use population density groups to restrict comparisons of treated and controls
zp = pd.read_excel(cdd['p_d_geo_demo'] + r'\zip_pop.xlsx')[['ZCTA5CE10','ALAND10','wlong','wlat']].rename(columns={'ZCTA5CE10':'zip','ALAND10':'sqkm'})
for v in list(zp):
    zp[v] = pd.to_numeric(zp[v])


#For each year create a temporary file and calculate the population weighted quantile bins
acs = pd.read_parquet(cdd['p_d_acs_zip'] + r'\acs_2014_2022_s2.parquet')
acs = pd.merge(acs,zp,how='left',on='zip')
acs['pop2sqkm'] = (acs['population'] / acs['sqkm'])
pop_dens = []
qvars = ['pop2sqkm','gini','mhi_quin1','inc_hhmean','educ_sc_p','educ_bach_p','pvf100_p','pvf200_p', 'insured_p', 'snap_p', 'aid_p', 'comm_car_p', 'ownocc_p', 'white_p']
for y in tqdm(acs['year'].unique()):
    temp=acs[acs.year==y][['zip','year','population']+qvars].copy()
    for v in qvars:
        temp[v+'_wq10'] = weighted_qcut(temp[v],temp['population'],10)
    pop_dens += [temp]
pop_dens = pd.concat(pop_dens,ignore_index=True,sort=False)

acs= pd.merge(acs,pop_dens.drop(columns=['population']+qvars),how='left',on=['zip','year'])

acs.to_parquet(cdd['p_d_acs_zip'] + r'\acs_2014_2022_s3.parquet')
acs.to_stata(cdd['p_d_acs_zip'] + r'\acs_2014_2022_s3.dta')


